

############ DATA PREP ############

## Load packages ----

pacman::p_load(sf,
               data.table, 
               lwgeom,
               RSQLite,
               tidyverse,
               lubridate,
               raster,
               rgis, #remotes::install_github("Pakillo/rgis")
               nngeo,
               rgdal,
               units,
               knitr, 
               readstata13,
               plm,
               dplyr,
               reshape2,
               biglm,
               lfe,
               haven,
               ggplot2,
               rgdal,
               spData,
               spDataLarge,
               tmap,
               tmaptools,
               leaflet, #most popular java script mapping library in the world - Google maps, etc
               estimatr,
               stargazer,
               styler, #highlight a code chunk and press CNTRL+SHIFT+A for formating 
               tidycensus)

#devtools::install_version("velox", version = "0.2.0")
#devtools::install_github("Pakillo/rgis")






######## Data Merging ############

## Set working directory
#Aleksandra
setwd('/Users/aleksandraconevska/Dropbox/Harvard/G1/2020/Courses/Gov-2001/Replication-Paper/Replication/data/')

#María
#setwd("/Users/mariaballesteros/Dropbox/Harvard/G1/GOV 2001/Replication/data/")



##### Loading and cleaning all election data for merging 2016 and 2020 

## Loading the datasets from the counties that were problematic in the 2020 official vote and registration totals 

vtotal_official <-
  read.csv("./raw/election/VOTES_WASHINGTON_County_Precinct/2020/TotalbyPrecinct.csv") ## The below counties had different precinct codes relative to
                                                                                       ## both 2016 and the fire data. Changes to convert 
                                                                                       ## them were based on the official list of precinct
                                                                                       ## names and codes given by WA sec of state office 
                                                                                       ## changes were completed in STATA. See do file 
                                                                                       ## titled "precinct-codeissues-2020officialdata" for reference

clark_offic <-
  read.csv(
    "./raw/election/VOTES_WASHINGTON_County_Precinct/2020/officialresults-bycounty/TotalClark.csv"
  )

king_offic <-
  read.csv(
    "./raw/election/VOTES_WASHINGTON_County_Precinct/2020/officialresults-bycounty/King/TotalKing-votesempty.csv"
  )

pierce_offic <-
  read.csv(
    "./raw/election/VOTES_WASHINGTON_County_Precinct/2020/officialresults-bycounty/TotalPierce.csv"
  )

snohom_offic <-
  read.csv(
    "./raw/election/VOTES_WASHINGTON_County_Precinct/2020/officialresults-bycounty/TotalSnohomish.csv"
  )


## Removing problematic counties from the official precinct level dataset to then re add them with the right precinct codes 
vtotal_official2 <-
  vtotal_official[!(vtotal_official$cocountyname == "King"),]

vtotal_official3 <-
  vtotal_official2[!(vtotal_official2$cocountyname == "Pierce"),]

vtotal_official4 <-
  vtotal_official3[!(vtotal_official3$cocountyname == "Snohomish"),]

vtotal_official5 <-
  vtotal_official4[!(vtotal_official4$cocountyname == "Clark"),]

vtotal_official5 <- dplyr::select(vtotal_official5,-c(id))


## Cleaning King county individual excel file to bind with the official
king_offic <-
  dplyr::select(king_offic,-c(precinctcode, precinctpart, nvals, X_merge, countycode))

king_offic$PrecinctCode <- king_offic$precinctcodeoffical
king_offic <- dplyr::select(king_offic, -c(precinctcodeoffical))

king_offic$cocountyname <- king_offic$county
king_offic <- dplyr::select(king_offic, -c(county))


## Cleaning Pierce county individual excel file to bind with the official
pierce_offic <-
  dplyr::select(pierce_offic,-c(precinctcode, precinctpart, nvals, X_merge, countycode))
pierce_offic$PrecinctCode <- pierce_offic$precinctcodeofficial

pierce_offic <-
  dplyr::select(pierce_offic, -c(precinctcodeofficial))

pierce_offic$cocountyname <- pierce_offic$county
pierce_offic <- dplyr::select(pierce_offic, -c(county))


## Cleaning Clark county individual excel file to bind with the official
clark_offic <-
  dplyr::select(clark_offic,-c(precinctcode, precinctpart, nvals, X_merge, countycode))

clark_offic$PrecinctCode <- clark_offic$precinctcodeofficial
clark_offic <- dplyr::select(clark_offic, -c(precinctcodeofficial))

clark_offic$cocountyname <- clark_offic$county
clark_offic <- dplyr::select(clark_offic, -c(county))


## Cleaning Snohom county individual excel file to bind with the official
snohom_offic <-
  dplyr::select(snohom_offic,-c(precinctcode, precinctpart, nvals, X_merge, countycode))

snohom_offic$PrecinctCode <- snohom_offic$precinctcodeofficial
snohom_offic <-
  dplyr::select(snohom_offic, -c(precinctcodeofficial))

snohom_offic$cocountyname <- snohom_offic$county
snohom_offic <- dplyr::select(snohom_offic, -c(county))


## Changing precinct names to character
snohom_offic$precinctname <- as.character(snohom_offic$precinctname)
king_offic$precinctname <- as.character(king_offic$precinctname)
clark_offic$precinctname <- as.character(clark_offic$precinctname)
pierce_offic$precinctname <- as.character(pierce_offic$precinctname)

## Changing regvoters to numeric
vtotal_official5$regvoters <- as.numeric(vtotal_official5$regvoters)
king_offic$regvoters <- as.numeric(king_offic$regvoters)


## Combining to one dataset
vtotal_official_clean <-
  dplyr::bind_rows(
    mutate(clark_offic, preccodesource = "list"),
    mutate(king_offic, preccodesource = "list"),
    mutate(snohom_offic, preccodesource = "list"),
    mutate(pierce_offic, preccodesource = "list"),
    mutate(vtotal_official5, preccodesource = "official")
  )
##write.csv(vtotal_official_clean, "prec_totaloffic_cleaned_2020")


## Creating abbreviated county name in 2020 for eventual merge with 2016 
county_names = c("Adams","Asotin","Benton","Chelan", "Clallam", 
                 "Clark", "Columbia", "Cowlitz", "Douglas", "Ferry", 
                 "Franklin", "Garfield","Grant","Grays Harbor","Island",
                 "Jefferson","King","Kitsap","Kittitas","Klickitat","Lewis",
                 "Lincoln","Mason","Okanogan","Pacific","Pend Oreille","Pierce",
                 "San Juan","Skagit","Skamania","Snohomish","Spokane","Stevens",
                 "Thurston","Wahkiakum","Walla Walla","Whatcom","Whitman","Yakima")

counties_match = county_names 
abs = c("AD","AS","BE","CH","CM","CR","CU","CZ","DG","FE","FR","GA","GR","GY","IS","JE",
        "KI","KP","KS","KT","LE","LI","MA","OK","PA","PE","PI","SJ","SK","SM","SN","SP",
        "ST","TH","WK","WL","WM","WT","YA")


counties_match = as.data.frame(counties_match)
counties_match$abs = abs
colnames(counties_match) = c("County","Abs")
counties_match$cocountyname = counties_match$County
counties_match$CountyName = counties_match$Abs

vtotal_official_clean = left_join(vtotal_official_clean, counties_match, by = "cocountyname")



#### Collapsing and merging 2020 individual level voter data to obtain additional demographic variables

## Loading in the individual level dta - manually imported the txt file into STATA due to the size of the txt file 
dtaindv <-
  read.dta13("./raw/election/VOTES_WASHINGTON_County_Precinct/2020/reg_voter_complete.dta")

## Create binary variables for the voters that we care about

#gender
dtaindv$fem <- ifelse(dtaindv$gender == "F", 1, 0)
dtaindv$male <- ifelse(dtaindv$gender == "M", 1, 0)

dtaindv$gender_unknown <-
  ifelse(
    dtaindv$gender == "" |
      dtaindv$gender == "U" |
      dtaindv$gender == "O" |
      dtaindv$gender == " " | dtaindv$gender == "WA",
    1,
    0
  )

# Age
election_day <- as.Date("2020-11-03")
dtaindv$birthdate <- as.Date(dtaindv$birthdate)
#generates age on election day
dtaindv$age <-
  as.period(interval(start = dtaindv$birthdate, end = election_day))$year
# Age interval 18 to 30
dtaindv$age18_30 <- ifelse(dtaindv$age <= 30, 1, 0)
# Age interval 30-45
dtaindv$age30_45 <- ifelse(dtaindv$age > 30 &
                             dtaindv$age <= 45, 1, 0)
# Age interval 45-65
dtaindv$age45_65 <- ifelse(dtaindv$age > 45 &
                             dtaindv$age <= 65, 1, 0)
# Age interval 65+
dtaindv$age65plus <- ifelse(dtaindv$age > 65, 1, 0)

## voted 2020
dtaindv$lastvoted <- as.Date(dtaindv$lastvoted)

dtaindv$voted2020 <- ifelse(year(dtaindv$lastvoted) == "2020", 1, 0)
dtaindv$totalreg <- 1

## gender and voted
dtaindv$fem_voted <- ifelse(dtaindv$gender == "F" & dtaindv$voted2020 ==1, 1, 0)
dtaindv$male_voted <- ifelse(dtaindv$gender == "M"& dtaindv$voted2020 ==1, 1, 0)
dtaindv$young_voted <- ifelse(dtaindv$age18_30 == 1 & dtaindv$voted2020 ==1, 1, 0)

## Generating unique id to match fire data
dtaindv$FullPrc <-
  paste0(dtaindv$countycode,
         str_pad(dtaindv$precinctcode, 8, side = "left", pad = "0"))

dtaindv_collapse <- dtaindv %>%
  group_by(FullPrc, countycode, precinctcode) %>%
  summarise_at(
    vars(
      totalreg,
      voted2020,
      age18_30,
      age30_45,
      age45_65,
      age65plus,
      male,
      fem,
      gender_unknown,
      male_voted,
      fem_voted, 
      young_voted
    ),
    sum,
    na.rm = T
  ) %>%
  drop_na(any_of(c("countycode", "precinctcode"))) %>%
  filter(precinctcode != "NA" |
           precinctcode != "" | countycode != "NA" | countycode != "")

individual <-
  dtaindv_collapse %>% dplyr::select("FullPrc", "countycode", "precinctcode") %>%
  arrange(countycode)

## Checking for duplicates in collapsed individual level data
dup <-
  dtaindv_collapse[!duplicated(dtaindv_collapse$FullPrc),]

dtaindv_collapse$CountyName <- dtaindv_collapse$countycode



## Collapse 2020 results from the King County website (where total votes cast are not just 0s as in the SOS official data) 
# into total Presidential votes cast by precinct to then merge onto the official SOS data with the rest of the counties 

off_res = read.csv(
  './raw/election/VOTES_WASHINGTON_County_Precinct/2020/officialresults-bycounty/King/King-final-precinct-results.csv'
)

pre_name = read.csv(
  './raw/election/Voter Registration Washington/official-precinct-names2020/2020.12.01-Districts_Precincts-Names.csv'
)

## drop all non-Presidential votes from official results
off_res = off_res[off_res$Race == "State of Washington US President & Vice President", ]

## drop to just the "Total Counted" results -- this is the sum of
## all votes cast in that precinct
off_res = off_res[off_res$CounterType == "Times Counted", ]

## We now have the data for just the total vote counts by precinct.
## Need to now append on the correct precinct code variable for each precinct from the 2020 official precicnt names file 
## (this data uses precinct name instead of code to uniquely identify county-precincts)

## create County Code variable to facilitate merge
off_res$CountyCode = "KI"

## Rename precinct variable to facilitate merge
colnames(off_res)[1] = "PrecinctName"

## drop non-King county precincts from precinct names dataset
pre_name = pre_name[pre_name$CountyCode == "KI", ]

## merge based on County Code and Precinct Name conditions
off_res = merge(
  x = off_res,
  y = pre_name[, c("CountyCode", "PrecinctName", "PrecinctCode")],
  by = c("CountyCode", "PrecinctName"),
  all.x = FALSE
)

## This yielded a number of duplicates based on the number of rows in the
## original precinct names dataset. Remove duplicate rows.
King_2020_offic_res = off_res[!duplicated(off_res), ]

#Clean to have only variables needed
King_2020_offic_res$CountyName <- King_2020_offic_res$CountyCode

King_2020_offic_res <-
  dplyr::select(
    King_2020_offic_res,
    -c(
      PrecinctName,
      Race,
      LEG,
      CC,
      CG,
      CounterGroup,
      Party,
      CounterType,
      CountyCode
    )
  )

## Merging new King dataframe with official data
vtotal_official_clean <-
  left_join(vtotal_official_clean,
            King_2020_offic_res,
            by = c("CountyName", "PrecinctCode"))


## Replacing NAs in the King County vote count var (NAs are from non-King counties) with values from official
vtotal_official_clean$SumOfCount[is.na(vtotal_official_clean$SumOfCount)] <-
  vtotal_official_clean$ballotscast[is.na(vtotal_official_clean$SumOfCount)]

vtotal_official_clean <-
  dplyr::select(vtotal_official_clean,
                -c(ballotscast, Abs, County))

vtotal_official_clean$ballotscast <-
  vtotal_official_clean$SumOfCount

vtotal_official_clean <-
  dplyr::select(vtotal_official_clean,
                -c(SumOfCount))




## Merging collapsed individual level into official 2020 precinct level data for the demographic variables 
dtaindv_collapse$PrecinctCode <- dtaindv_collapse$precinctcode

vtotal_official_clean <-
  left_join(vtotal_official_clean,
            dtaindv_collapse,
            by = c("CountyName", "PrecinctCode"))


vtotal_official_clean <-
  dplyr::select(vtotal_official_clean,
                -c(totalreg, voted2020))




## Loading in the 2016 data to merge with 2020
vtotal_official2016 <-
  read.csv("./raw/election/VOTES_WASHINGTON_County_Precinct/2016/2016-PrecinctTotals.csv")


## Removing duplicates in 2016
vtotal_official2016$unique <-
  paste0(vtotal_official2016$CountyCode,
         as.character(vtotal_official2016$PrecinctCode))

vtotal_official2016 <-
  vtotal_official2016[!duplicated(vtotal_official2016$unique),]

write.csv(vtotal_official2016, '2016-PrecinctsTotals-Unique-R.csv')

## Preparing for Merging with 2020 
vtotal_official2016$CountyName <- vtotal_official2016$CountyCode
vtotal_official2016 <-
  dplyr::select(vtotal_official2016,-c(CountyCode))
vtotal_official2016$CountyCode <- vtotal_official2016$id

vtotal_official2016$RegVoters2016 <- vtotal_official2016$RegVoters
vtotal_official2016$Voted2016 <- vtotal_official2016$Voted
vtotal_official2016 <-
  dplyr::select(vtotal_official2016,-c(RegVoters, Voted))

## Merging 2016 and 2020 official election data 
vofficial_1620 <- left_join(vtotal_official2016, vtotal_official_clean, by=c("CountyName","PrecinctCode"))







#### Loading fire exposure and occupation controls data to prepare to merge with election
fire <- readRDS("./processed/BA_PRECINCT_INT/fire.rds") #this file includes all of the fire exposure variables
                                                        #see geo-spatial folder for detailed code on how it was obtained

occupation <-
  read.dta13("./raw/occupation/occupation.dta") #for more detail on this data, see the Beland, Brodeur, Wright (2020)
#paper, titled "Beland-Brodeur-Wright-2020-CPSdata.pdf" in the
#references folder
codes_2019 <-
  read.csv(
    "./raw/election/VOTES_WASHINGTON_County_Precinct/2019.01.16-districts_precincts.csv"
  )

fips_to_county <- read.csv("./raw/election/FIPS to county codes.csv")


### Adding leading 0s to state codes in occupation data
occupation$FIPS <-
  ifelse(
    nchar(occupation$fips_new) == 4,
    paste0(0, occupation$fips_new),
    occupation$fips_new
  )


## Generating state and county codes
occupation$state_code <- substr(occupation$FIPS, 1, 2)
occupation$county_code <- substr(occupation$FIPS, 3, 5)


## Filtering only Washington and Needed variables
occupation <- filter(occupation, state_code == "53")
occupation <-
  occupation %>% dplyr::select("exposure", "proximity", "remindex", "crit_work", "FIPS")


## Adding county names to fire data
countynames <-
  tibble(CountyCd = unique(codes_2019$CountyCode),
         CountyName = unique(codes_2019$County))
fires <- left_join(fire, countynames, by = "CountyName")


## Adding fips codes to fire data

fips_to_county$CountyName <- fips_to_county$Name
fires <-
  left_join(fires, filter(fips_to_county, State == "WA"), by = "CountyName")


## Fips codes from strings to numeric
fires$FIPS <- as.numeric(fires$FIPS)
occupation$FIPS <- as.numeric(occupation$FIPS)


## Merging occupation data to fires data by fips
fire_occup <- left_join(fires, occupation, by = "FIPS")


## Generating census variables and merging
census_api_key("1258044fe3c67761d230b11706096162754ab636")


## Exporting data from ACS
census <-
  get_acs(
    geography = "county",
    variables = "B01003_001",
    state = "WA",
    geometry = T
  )
census$urban <- ifelse(census$estimate >= 50000, 1, 0)
census <-
  tibble(FIPS = census$GEOID,
         pop = census$estimate,
         urban = census$urban)
census$FIPS <- as.numeric(census$FIPS)

fire <- left_join(fire_occup, census, by = "FIPS")

## Preparing fire data to merge on  
fire$CountyName <- fire$CountyCd
fire$PrecinctCode <- gsub("-", "", fire$PrecCode)
fire$PrecinctCode <- as.numeric(fire$PrecinctCode)




#### Merging fire data with full official election data 
fire$FullPrc <- fire$St_Code
votefire_1620 <- left_join(vofficial_1620, fire, by=c("FullPrc"))










######### Data cleaning ############


## Removing raw variable type 
votefire_1620 <- dplyr::select(votefire_1620, -c(geom))


## Renaming varnames 
votefire_1620$regvoters2020 <- votefire_1620$regvoters
votefire_1620$ballotscast2020 <- votefire_1620$ballotscast

votefire_1620$regvoters2016 <- votefire_1620$RegVoters2016
votefire_1620$ballotscast2016 <- votefire_1620$Voted2016

## Removing unnecessary vars
votefire_1620 <-
  dplyr::select(votefire_1620,
                -c(RegVoters2016, Voted2016))



## Creating outcome 
votefire_1620$turnout2016 <- votefire_1620$ballotscast2016/votefire_1620$regvoters2016
votefire_1620$turnout2020 <- votefire_1620$ballotscast2020/votefire_1620$regvoters2020


## Creating fire distance measure in km 
votefire_1620$PrecDist5000Fire_km <- votefire_1620$PrecDistTo5000Fire/1000

write.csv(votefire_1620, "./processed/VOTING/prectotal_offic_1620.csv")

test$PrecDist5000Fire_km_new <- test$PrecDistTo5000Fire.y/1000



##
delvars <- c('turnout2016', 'turnout2020', 'BAWithin0to5km',
             'WHPMean', 'crit_work', 'remindex', 'proximity',
             'exposure', 'urban', 'FIPS')

votefire1620_complete <- votefire_1620[complete.cases(votefire_1620[ , delvars]),] #this allows us to run the models without imputation



# 
# ## PLAYING WITH NEW DATA
# 
# #loading UPDATED DATA and merging with Rmd file 
# fire <- readRDS("./processed/BA_PRECINCT_INT/fireupdate.rds") #this file includes all of the fire exposure variables

# fires$PrecinctCode <- fires$PrecCode
# fires$CountyName <- fires$CountyCd
# fires$CountyName <- as.character(fires$CountyCd)
# test <- left_join(votefire_1620_comp, fires, by=c("CountyName", "PrecinctCode"))
# 
# 
# m4 <-  lm(
#   turnout2020 ~ PrecDist5000Fire_km_new  + WHPMean.y + crit_work +
#     remindex + proximity + exposure + urban + pop,
#   data = test
# ) 


